Import packages and data¶

In [1]:
%%capture
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz

try:
  from prettytable import PrettyTable
  from pypfopt import HRPOpt
except:
  !pip install -q - U prettytable
  !pip install -q -U PyPortfolioOpt
  from pypfopt import HRPOpt
  from prettytable import PrettyTable


from google.colab import drive
from IPython.display import display_html, HTML, display
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from sklearn.tree import export_graphviz
from IPython.display import Image
from scipy import stats
from statistics import multimode
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from pypfopt import plotting
In [2]:
# get all data from GitHub repo
files = ['er','ed','futures','infswp','sofr','yields','ez_cds']
file_dict = {}

for file in files:
  file_dict[file] = pd.read_csv('https://raw.githubusercontent.com/sjv1030/wq-capstone/main/data/'+file+'.csv')
  file_dict[file]['Date'] = pd.to_datetime(file_dict[file]['Date'], format='%m/%d/%Y')
  file_dict[file].set_index('Date', inplace=True)
  file_dict[file] = file_dict[file].loc[file_dict[file].index.dropna()]
  file_dict[file] = file_dict[file].loc[:'2024'].copy()
  file_dict[file].columns = file_dict[file].columns.str.strip()
  #print(file_dict[file].info())
  #display(file_dict[file].head())

Data transformation¶

In [3]:
# Initial training period (year)
beg_yr = 2010
end_yr = 2012

Convert STIR into yield term¶

Cycle through all contracts and get difference between Eurodollar futures and SOFR futures. For each contract, identify the date where the absolute delta between the Eurodollar and SOFR contract prices is minimized. This date will be used to merge the two contracts in the next step.

In [4]:
merge_ed_sfr = dict()
for k in range(1,13):
  _ = pd.concat([file_dict['ed'][['ED'+str(k)]], file_dict['sofr'][['SFR'+str(k)]]], axis=1)
  _['delta'+str(k)] = np.abs(_['ED'+str(k)] - _['SFR'+str(k)])
  _['delta'+str(k)].loc['2023':'2023'].dropna().plot(legend=True)
  dates = (_['delta'+str(k)].loc['2023':'2023'].index[_['delta'+str(k)].loc['2023':'2023'].dropna().argmin()],
           _['delta'+str(k)].loc['2023':'2023'].index[_['delta'+str(k)].loc['2023':'2023'].dropna().argmin()+1])
  merge_ed_sfr[k] = dates

plt.show()
No description has been provided for this image
In [5]:
# merge ED and SFR futures
backdated_SFR = pd.DataFrame()
for k in range(1,13):
  merge_date, merge_date_1 = merge_ed_sfr[k]
  tmp = pd.concat([file_dict['ed']['ED'+str(k)].loc[:merge_date],
                    file_dict['sofr']['SFR'+str(k)].loc[merge_date_1:]], axis=0
  )
  backdated_SFR['SFR'+str(k)] = tmp

Calculate calendar spreads for the STIR futures. The result is in yield terms.

Short-term interest rate (STIR) futures are quoted in price terms. Given standard contract conventions, 100 less the price equates to a yield in percent form. Arguably, for macro/fixed-income investment ideas, the yield is more important.

Note that a term spread is usually calculated with the longer duration security first.

For example, the US Yield Curve can be defind as 10-year US Treasury less the 2-year US Treasury But with STIR futures, one can reverse the order to get the term spread in yield terms.

Example - Using prices from Jan 9, 2023:

  • SFR4's price was 95.405 (or 4.595%)
  • SFR8's price was 96.790 (or 3.21%)

One can calculate the term spread as SFR4 - SFR8, or -1.385% Note: the term spread is negative because the curve is inverted

In [6]:
# calculate STIR calendar spreads
tmp_dict = {}
for i in range(1,12):
  for j in range(2,13):
    if i < j:
      tmp_dict['ER'+str(i)+'-'+str(j)] = file_dict['er']['ER'+str(i)] - file_dict['er']['ER'+str(j)]
      tmp_dict['SFR'+str(i)+'-'+str(j)] = backdated_SFR['SFR'+str(i)] - backdated_SFR['SFR'+str(j)]

# save data to a dataframe
full_data = pd.DataFrame.from_dict(tmp_dict)
In [7]:
# calculate cross country spreads at key tenors and add to main dataframe
tmp_dict = {}
for i in range(4,13,2):
  tmp_dict['SFRER'+str(i)] = backdated_SFR['SFR'+str(i)] - file_dict['er']['ER'+str(i)]

full_data = full_data.join(pd.DataFrame.from_dict(tmp_dict), how='left')

Create spreads¶

Below is a large dictionary holding all of the various spreads to be used as either a target or within the feature set.

In [8]:
file_dict['spread'] = pd.DataFrame()

file_dict['spread']['US302'] = file_dict['yields']['US30'] - file_dict['yields']['US2']
file_dict['spread']['US305'] = file_dict['yields']['US30'] - file_dict['yields']['US5']
file_dict['spread']['US3010'] = file_dict['yields']['US30'] - file_dict['yields']['US10']
file_dict['spread']['US102'] = file_dict['yields']['US10'] - file_dict['yields']['US2']
file_dict['spread']['US105'] = file_dict['yields']['US10'] - file_dict['yields']['US5']
file_dict['spread']['US52'] = file_dict['yields']['US5'] - file_dict['yields']['US2']

file_dict['spread']['DE302'] = file_dict['yields']['DE30'] - file_dict['yields']['DE2']
file_dict['spread']['DE305'] = file_dict['yields']['DE30'] - file_dict['yields']['DE5']
file_dict['spread']['DE3010'] = file_dict['yields']['DE30'] - file_dict['yields']['DE10']
file_dict['spread']['DE102'] = file_dict['yields']['DE10'] - file_dict['yields']['DE2']
file_dict['spread']['DE105'] = file_dict['yields']['DE10'] - file_dict['yields']['DE5']
file_dict['spread']['DE52'] = file_dict['yields']['DE5'] - file_dict['yields']['DE2']

file_dict['spread']['CA102'] = file_dict['yields']['CA10'] - file_dict['yields']['CA2']
file_dict['spread']['CA105'] = file_dict['yields']['CA10'] - file_dict['yields']['CA5']
file_dict['spread']['CA52'] = file_dict['yields']['CA5'] - file_dict['yields']['CA2']

file_dict['spread']['ITDE10'] = file_dict['yields']['IT10'] - file_dict['yields']['DE10']
file_dict['spread']['USDE2'] = file_dict['yields']['US2'] - file_dict['yields']['DE2']
file_dict['spread']['USDE5'] = file_dict['yields']['US5'] - file_dict['yields']['DE5']
file_dict['spread']['USDE10'] = file_dict['yields']['US10'] - file_dict['yields']['DE10']

file_dict['spread']['USEZINF2'] = file_dict['infswp']['US-2'] - file_dict['infswp']['EZ-2']
file_dict['spread']['USEZINF5'] = file_dict['infswp']['US-5'] - file_dict['infswp']['EZ-5']
file_dict['spread']['USEZINF10'] = file_dict['infswp']['US-10'] - file_dict['infswp']['EZ-10']

file_dict['spread']['USINF102'] = file_dict['infswp']['US-10'] - file_dict['infswp']['US-2']
file_dict['spread']['USINF105'] = file_dict['infswp']['US-10'] - file_dict['infswp']['US-5']
file_dict['spread']['USINF52'] = file_dict['infswp']['US-5'] - file_dict['infswp']['US-2']

file_dict['spread']['EZINF102'] = file_dict['infswp']['EZ-10'] - file_dict['infswp']['EZ-2']
file_dict['spread']['EZINF105'] = file_dict['infswp']['EZ-10'] - file_dict['infswp']['EZ-5']
file_dict['spread']['EZINF52'] = file_dict['infswp']['EZ-5'] - file_dict['infswp']['EZ-2']
In [9]:
# STIR futures strictly follow US trading days, so the other dataframe holding spreads (calculated above) will be merged with this one
full_data = full_data.join(file_dict['spread'], how='left')
full_data = full_data.join(file_dict['yields'], how='left')
full_data = full_data.join(file_dict['infswp'], how='left')
full_data.head()
Out[9]:
ER1-2 SFR1-2 ER1-3 SFR1-3 ER1-4 SFR1-4 ER1-5 SFR1-5 ER1-6 SFR1-6 ... US10 US2 US5 US30 EZ-2 EZ-5 EZ-10 US-2 US-5 US-10
Date
2010-01-04 0.360 0.280 0.73 0.660 1.065 1.075 1.330 1.475 1.570 1.865 ... 3.8155 1.0638 2.6350 4.6431 1.6800 2.1600 2.4410 1.5398 2.3475 2.7818
2010-01-05 0.335 0.245 0.70 0.610 1.025 1.005 1.295 1.385 1.540 1.760 ... 3.7608 1.0080 2.5611 4.6091 1.7025 2.1725 2.4575 1.5556 2.3500 2.7937
2010-01-06 0.330 0.225 0.69 0.585 1.015 0.990 1.290 1.385 1.545 1.770 ... 3.8215 0.9920 2.5913 4.6875 1.7300 2.1950 2.4880 1.5811 2.4050 2.8445
2010-01-07 0.310 0.225 0.66 0.590 0.985 1.005 1.260 1.415 1.525 1.810 ... 3.8235 1.0240 2.6115 4.6855 1.7650 2.2200 2.4880 1.6155 2.3545 2.8551
2010-01-08 0.300 0.190 0.64 0.520 0.955 0.925 1.230 1.330 1.500 1.735 ... 3.8297 0.9759 2.5912 4.7150 1.7600 2.2050 2.4670 1.6061 2.3920 2.8490

5 rows × 184 columns

PCA for EZ CDS¶

In [10]:
# this function will be leveraged in the work-forward analysis
# to test the Italian-German 10-year spread

def make_pca(df, beg, end):
  '''
  Function creates a risk index from PCA loadings.
  '''
  cds = df.ffill().iloc[beg:end].diff().dropna().values
  scaler = StandardScaler()
  scaled_cds = scaler.fit_transform(cds)

  pca = PCA(n_components=3)

  pc = pca.fit_transform(scaled_cds)

  # multiply PCA loadings with scaled underlying data
  scaled_cds_array = np.array(scaled_cds)
  scaled_loadings = np.dot(scaled_cds_array, pca.components_.T)

  # multiply "scaled loadings" with principal component weights, respectively
  # to create one index that combines all three principal components
  # this index had to be negated to make it comparable with underlying CDS data

  EZ_risk_index = -np.dot(scaled_loadings, pca.explained_variance_)
  EZ_risk = pd.DataFrame(EZ_risk_index.cumsum(), index=file_dict['ez_cds'].loc[str(beg):str(end)].index[1:])
  EZ_risk.columns = ['EZ Risk Index']
  return EZ_risk

Use PCA to identify the top principal components (PCs). Plot a chart of explained variance and a biplot showing how the original data maps onto the first 2 PCs.

In [11]:
cds = file_dict['ez_cds'].ffill().loc[str(beg_yr):str(end_yr)].diff().dropna().values
scaler = StandardScaler()
scaled_cds = scaler.fit_transform(cds)

pca = PCA(n_components=3)

pc = pca.fit_transform(scaled_cds)

print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())

plt.bar([1,2,3], pca.explained_variance_ratio_.cumsum())
plt.plot([1,2,3], pca.explained_variance_ratio_, color='red', marker='o')
plt.legend(['Individual Variance', 'Cumulative Variance'])
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance % (1=100%)')
plt.ylim([0,1])
plt.xticks(np.arange(1, 3.1, step=1))
plt.show()
[0.81588054 0.05860003 0.04874604]
[0.81588054 0.87448057 0.92322662]
No description has been provided for this image
In [12]:
# visualize the loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2', 'PC3'], index=file_dict['ez_cds'].columns)
display(loadings)

def loading_plot(coeff, labels):
    n = coeff.shape[0]
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1], head_width = 0.05, head_length = 0.05, color = '#21918C',alpha = 0.5)
        plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = '#21918C', ha = 'center', va = 'center')
    plt.xlim(-1,1)
    plt.ylim(-1,1)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid()

fig, ax = plt.subplots(figsize = (7,4))
loading_plot(pca.components_.T, file_dict['ez_cds'].columns)
plt.show()
PC1 PC2 PC3
CS -0.341493 0.319370 0.515410
UBS -0.339588 0.367801 0.469703
Unicredit -0.362522 -0.041834 -0.334119
BancoSantander -0.353965 -0.544182 0.199138
BNP -0.360829 0.245916 -0.315533
Intesa -0.360868 -0.045345 -0.394348
SocGen -0.356120 0.284210 -0.267641
BBVA -0.352283 -0.567040 0.189044
No description has been provided for this image
In [13]:
# multiply PCA loadings with scaled underlying data
scaled_cds_array = np.array(scaled_cds)
scaled_loadings = np.dot(scaled_cds_array, pca.components_.T)

# multiply "scaled loadings" with principal component weights, respectively
# to create one index that combines all three principal components
# this index had to be negated to make it comparable with underlying CDS data

EZ_risk_index = -np.dot(scaled_loadings, pca.explained_variance_)

The risk index created using the PCA loadings and scaled CDS data aligns well with the underlying CDS data, suggesting it is capturing a large amount of the common variation.

In [14]:
file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)].plot(legend=True)
plt.title('5-Year CDS of Eurozone Banks')
plt.xlabel('')
plt.ylabel('Basis Points')
plt.show()

plt.plot(file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)].index[1:], EZ_risk_index.cumsum(),
         label='EZ Risk Index', color='k')
plt.title('5-Year CDS of Eurozone Banks vs. EZ Risk Index')
plt.ylabel('Basis Points')
plt.legend()
plt.plot(file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)])
plt.show()
No description has been provided for this image
No description has been provided for this image
In [15]:
# create PCA index and add to larger datafile
EZ_risk = pd.DataFrame(EZ_risk_index.cumsum(), index=file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)].index[1:])
EZ_risk.columns = ['EZ Risk Index']

full_data = full_data.join(EZ_risk, how='left')
full_data.head()
Out[15]:
ER1-2 SFR1-2 ER1-3 SFR1-3 ER1-4 SFR1-4 ER1-5 SFR1-5 ER1-6 SFR1-6 ... US2 US5 US30 EZ-2 EZ-5 EZ-10 US-2 US-5 US-10 EZ Risk Index
Date
2010-01-04 0.360 0.280 0.73 0.660 1.065 1.075 1.330 1.475 1.570 1.865 ... 1.0638 2.6350 4.6431 1.6800 2.1600 2.4410 1.5398 2.3475 2.7818 -4.918199
2010-01-05 0.335 0.245 0.70 0.610 1.025 1.005 1.295 1.385 1.540 1.760 ... 1.0080 2.5611 4.6091 1.7025 2.1725 2.4575 1.5556 2.3500 2.7937 -10.947791
2010-01-06 0.330 0.225 0.69 0.585 1.015 0.990 1.290 1.385 1.545 1.770 ... 0.9920 2.5913 4.6875 1.7300 2.1950 2.4880 1.5811 2.4050 2.8445 -15.339995
2010-01-07 0.310 0.225 0.66 0.590 0.985 1.005 1.260 1.415 1.525 1.810 ... 1.0240 2.6115 4.6855 1.7650 2.2200 2.4880 1.6155 2.3545 2.8551 -20.284589
2010-01-08 0.300 0.190 0.64 0.520 0.955 0.925 1.230 1.330 1.500 1.735 ... 0.9759 2.5912 4.7150 1.7600 2.2050 2.4670 1.6061 2.3920 2.8490 -23.648763

5 rows × 185 columns

EDA¶

In [16]:
# convert dataset to weekly at Friday observations
full_data_wf = full_data.resample('W-FRI').last()
full_data_wf.head()
Out[16]:
ER1-2 SFR1-2 ER1-3 SFR1-3 ER1-4 SFR1-4 ER1-5 SFR1-5 ER1-6 SFR1-6 ... US2 US5 US30 EZ-2 EZ-5 EZ-10 US-2 US-5 US-10 EZ Risk Index
Date
2010-01-08 0.300 0.190 0.640 0.520 0.955 0.925 1.230 1.330 1.500 1.735 ... 0.9759 2.5912 4.7150 1.7600 2.2050 2.4670 1.6061 2.3920 2.8490 -23.648763
2010-01-15 0.255 0.135 0.555 0.425 0.835 0.800 1.080 1.185 1.335 1.560 ... 0.8621 2.4159 4.5814 1.7000 2.1030 2.3680 1.5670 2.2750 2.7300 -6.161920
2010-01-22 0.225 0.110 0.500 0.370 0.780 0.730 1.035 1.100 1.285 1.470 ... 0.7875 2.3414 4.5312 1.6995 2.0470 2.3110 1.5878 2.2090 2.6706 25.244170
2010-01-29 0.265 0.115 0.550 0.355 0.820 0.700 1.060 1.080 1.290 1.460 ... 0.8118 2.3232 4.4884 1.5100 1.8950 2.2982 1.5722 2.2440 2.7591 33.599325
2010-02-05 0.235 0.105 0.445 0.305 0.660 0.610 0.860 0.950 1.070 1.300 ... 0.7633 2.2333 4.5186 1.4375 1.8125 2.1925 1.4932 2.1355 2.6228 57.463195

5 rows × 185 columns

Correlations¶

Intuitively, the various permutations of spreads calculated for the STIR futures above will have a high level of correlation amongst themselves.

As a result, the correlation heatmaps below focused on the yield spreads.

In [17]:
spreads_cols = file_dict['spread'].columns
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(full_data_wf[spreads_cols].loc[str(beg_yr):str(end_yr)].corr(method='pearson').round(2))
plt.title('Correlation on levels ('+str(beg_yr)+'-'+str(end_yr)+')')
plt.show()

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(full_data_wf[spreads_cols].loc[str(beg_yr):str(end_yr)].diff().corr(method='pearson').round(2))
plt.title('Correlation on first difference ('+str(beg_yr)+'-'+str(end_yr)+')')
plt.show()
No description has been provided for this image
No description has been provided for this image

Feature Selection via Correlations¶

The absolute value of pearson correlation is used to select the top securities whose values are above 0.7 (with a maximum of 10 features per target variable). This process will occur every time the model is updated during the walk-forward backtest.

In [18]:
target_list = []
for col in full_data_wf.columns:
  if col in file_dict['ed'].columns:
    continue
  elif col in file_dict['sofr'].columns:
    continue
  elif col in file_dict['yields'].columns:
    continue
  elif col in file_dict['ez_cds'].columns:
    continue
  elif col in file_dict['infswp'].columns:
    continue
  elif col == 'EZ Risk Index':
    continue
  else:
     target_list.append(col)
In [19]:
#target_list = ['US102','DE52','USDE10','SFR4-8','SFR6-12','ER4-8','ER6-12','USDE5',
#               'SFR2-8','ER2-8','ITDE10']

#target_list = spreads_cols.copy()
feature_dict = dict()
for tgt in target_list:
  _corr = full_data_wf.loc[str(beg_yr):str(end_yr)].diff().corr(method='pearson')[tgt]
  feature_dict[tgt] = _corr.abs().sort_values(ascending=False).where(lambda x: x > 0.7).dropna().index.tolist()[1:11]

The table below shows the target as column headings and the features (in the rows) that have the highest correlation (either positive or negative).

In [20]:
#feat_df = pd.DataFrame.from_dict(feature_list)
feat_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in feature_dict.items() ]))
display(HTML(feat_df[spreads_cols].to_html()))
US302 US305 US3010 US102 US105 US52 DE302 DE305 DE3010 DE102 DE105 DE52 CA102 CA105 CA52 ITDE10 USDE2 USDE5 USDE10 USEZINF2 USEZINF5 USEZINF10 USINF102 USINF105 USINF52 EZINF102 EZINF105 EZINF52
0 US30 US3010 US305 US10 US305 US5 DE102 DE302 DE305 DE302 DE305 DE102 CA52 NaN CA102 IT10 SFRER4 SFRER8 USDE5 US-2 NaN NaN USINF52 NaN USINF102 EZINF52 NaN EZINF102
1 US105 US105 NaN US30 US302 US10 DE305 DE105 NaN DE52 DE102 NaN NaN NaN NaN NaN DE2 SFRER10 NaN NaN NaN NaN US-2 NaN US-2 EZ-2 NaN NaN
2 US102 US302 NaN US302 NaN SFR2-12 DE105 DE3010 NaN DE105 DE302 NaN NaN NaN NaN NaN SFRER6 USDE10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 US305 NaN NaN US52 NaN SFR1-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN USDE5 SFRER6 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN UK10 NaN SFR4-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN DE5 USDE2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 NaN NaN NaN SFR8-12 NaN SFR3-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN SFRER12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 NaN NaN NaN SFR7-12 NaN SFR5-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 NaN NaN NaN SFR9-12 NaN SFR4-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 NaN NaN NaN SFR6-12 NaN US102 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 NaN NaN NaN NaN NaN SFR5-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Dendrogram¶

Hierarchical clustering is used to group stocks that have higher correlations. This approach measures the "distance" between values, which is defined as the distance between correlation coefficients. The farther the distance, then the more different (or dissimilar) the two values are from each other.

Given a matrix of correlation coefficients, the distance will be calculated as: $\sqrt{0.5 * ( 1 - \rho_{i,j})}$

The dendrogram below has clustered the stocks based on this distance measure. The x-axis labels are the sorted columns from the correlation matrix.

In [21]:
tgt_corr = full_data_wf[spreads_cols].loc[str(beg_yr):str(end_yr)].diff().corr(method='pearson').round(2).corr()
fig, ax = plt.subplots(figsize=(12, 5))
distance = np.sqrt(0.5 * (1 - tgt_corr))
Z = linkage(squareform(distance), 'complete')

dendrogram(Z, labels=tgt_corr.columns, orientation='top', leaf_rotation=90)
x_lab = ax.get_xticklabels()
plt.show()
No description has been provided for this image

Plots of raw data¶

Given the high number of features, only some will be charted for illustration.

In [22]:
# Plot example of elevated term spread during periods of high macro risk
# like the Italian-German 10-year spread during the Eurozone debt crisis in 2011
plt.figure(figsize=(8,3))
full_data_wf['ITDE10'].plot(title='ITDE10')
plt.ylabel('Spread in %')
plt.xlabel('')
plt.show()
No description has been provided for this image
In [23]:
for c, col in enumerate(spreads_cols):
    if c % 10 == 0:
      plt.figure(figsize=(8,3))
      full_data_wf[col].plot(title=col)
      plt.ylabel('Spread in %')
      plt.xlabel('')
      plt.show()
#for c, col in enumerate(full_data_wf.drop(spreads_cols, axis=1).columns):
#    if c % 15 == 0:
#      plt.figure(figsize=(8,3))
#      full_data_wf[col].plot(title=col)
#      plt.ylabel('Spread in %')
#      plt.xlabel('')
#      plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Check for stationarity¶

In [24]:
def adf_test(x):
    dftest = adfuller(x.dropna(),autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Numer of Observations Used'])
    print(dfoutput.round(3))

def run_eda(df):
  for col in df.columns:
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12,3))

    df[col].plot(title=col, ax=axes[0])
    sns.histplot(ax=axes[1], data=df,x=col)
    axes[1].set_title('Distribution')
    plot_acf(df[col].dropna(), ax=axes[2])
    axes[2].set_title('Autocorrelation')
    plt.tight_layout()

  for col in df.columns:
    print('Running ADF on '+col+'\n')
    adf_test(df[col])
    print('----------------------------------------')

Run ADF on all the target variables. While the target does not need to be stationary (since it will be converted to a categorical variable), the ADF will help inform if lags of the target should be included in the feature dataset.

The results suggest all target variables have serial correlation and are not stationary.

Note that the data was contained to 2010-2015 to avoid look ahead bias.

In [25]:
for tgt in target_list:
  run_eda(full_data_wf[[tgt]].loc[str(beg_yr) : str(end_yr)].diff()[1:])
Running ADF on ER1-2

Test Statistic                -10.509
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-2

Test Statistic                 -9.175
p-value                         0.000
#Lags Used                      2.000
Numer of Observations Used    152.000
dtype: float64
----------------------------------------
Running ADF on ER1-3

Test Statistic                -10.479
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-3

Test Statistic                 -9.275
p-value                         0.000
#Lags Used                      2.000
Numer of Observations Used    152.000
dtype: float64
----------------------------------------
Running ADF on ER1-4

Test Statistic                -10.516
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-4

Test Statistic                -13.519
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-5

Test Statistic                -10.293
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-5

Test Statistic                -13.078
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-6

Test Statistic                -10.145
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-6

Test Statistic                -12.903
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-7

Test Statistic                -10.086
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-7

Test Statistic                -12.876
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-8

Test Statistic                -10.18
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on SFR1-8

Test Statistic                -12.934
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-9

Test Statistic                -10.215
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-9

Test Statistic                -12.981
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-10

Test Statistic                -10.322
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-10

Test Statistic                -13.041
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER1-11

Test Statistic                -10.342
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-11

Test Statistic                -13.077
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
<ipython-input-24-74079bfd30a4>:8: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`.
  fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12,3))
Running ADF on ER1-12

Test Statistic                -10.401
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR1-12

Test Statistic                -13.199
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER2-3

Test Statistic                -10.727
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR2-3

Test Statistic                -13.641
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER2-4

Test Statistic                -10.422
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR2-4

Test Statistic                -13.18
p-value                         0.00
#Lags Used                      0.00
Numer of Observations Used    154.00
dtype: float64
----------------------------------------
Running ADF on ER2-5

Test Statistic                 -9.941
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR2-5

Test Statistic                -12.732
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER2-6

Test Statistic                 -9.753
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR2-6

Test Statistic                -12.683
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER2-7

Test Statistic                -12.539
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR2-7

Test Statistic                -12.778
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER2-8

Test Statistic                -12.918
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR2-8

Test Statistic                 -8.135
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER2-9

Test Statistic                -13.345
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR2-9

Test Statistic                 -8.091
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER2-10

Test Statistic                -13.689
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR2-10

Test Statistic                 -8.112
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER2-11

Test Statistic                -13.743
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR2-11

Test Statistic                 -8.171
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER2-12

Test Statistic                -13.789
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR2-12

Test Statistic                 -5.896
p-value                         0.000
#Lags Used                      2.000
Numer of Observations Used    152.000
dtype: float64
----------------------------------------
Running ADF on ER3-4

Test Statistic                -10.006
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR3-4

Test Statistic                -12.782
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER3-5

Test Statistic                -12.756
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-5

Test Statistic                 -7.179
p-value                         0.000
#Lags Used                      3.000
Numer of Observations Used    151.000
dtype: float64
----------------------------------------
Running ADF on ER3-6

Test Statistic                -13.111
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-6

Test Statistic                 -7.056
p-value                         0.000
#Lags Used                      3.000
Numer of Observations Used    151.000
dtype: float64
----------------------------------------
Running ADF on ER3-7

Test Statistic                -13.173
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-7

Test Statistic                 -7.832
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER3-8

Test Statistic                -13.487
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-8

Test Statistic                 -7.81
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on ER3-9

Test Statistic                -13.882
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-9

Test Statistic                 -7.804
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER3-10

Test Statistic                -14.084
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-10

Test Statistic                 -7.851
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER3-11

Test Statistic                -13.971
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-11

Test Statistic                 -7.94
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on ER3-12

Test Statistic                -13.848
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR3-12

Test Statistic                 -5.845
p-value                         0.000
#Lags Used                      2.000
Numer of Observations Used    152.000
dtype: float64
----------------------------------------
Running ADF on ER4-5

Test Statistic                -13.529
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR4-5

Test Statistic                 -7.274
p-value                         0.000
#Lags Used                      3.000
Numer of Observations Used    151.000
dtype: float64
----------------------------------------
Running ADF on ER4-6

Test Statistic                -13.722
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR4-6

Test Statistic                 -5.405
p-value                         0.000
#Lags Used                      4.000
Numer of Observations Used    150.000
dtype: float64
----------------------------------------
Running ADF on ER4-7

Test Statistic                -13.617
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR4-7

Test Statistic                 -7.615
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER4-8

Test Statistic                -13.941
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR4-8

Test Statistic                 -7.618
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER4-9

Test Statistic                -14.273
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR4-9

Test Statistic                 -7.641
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER4-10

Test Statistic                -14.305
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR4-10

Test Statistic                 -7.712
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER4-11

Test Statistic                -10.373
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR4-11

Test Statistic                 -7.834
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER4-12

Test Statistic                -10.381
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR4-12

Test Statistic                 -7.911
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER5-6

Test Statistic                -13.701
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR5-6

Test Statistic                 -5.116
p-value                         0.000
#Lags Used                      4.000
Numer of Observations Used    150.000
dtype: float64
----------------------------------------
Running ADF on ER5-7

Test Statistic                -13.662
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR5-7

Test Statistic                 -7.482
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER5-8

Test Statistic                -14.151
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR5-8

Test Statistic                 -7.554
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER5-9

Test Statistic                -14.472
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR5-9

Test Statistic                 -7.61
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on ER5-10

Test Statistic                -10.526
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR5-10

Test Statistic                 -7.727
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER5-11

Test Statistic                -10.547
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR5-11

Test Statistic                 -7.891
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER5-12

Test Statistic                -10.47
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on SFR5-12

Test Statistic                 -5.919
p-value                         0.000
#Lags Used                      2.000
Numer of Observations Used    152.000
dtype: float64
----------------------------------------
Running ADF on ER6-7

Test Statistic                -13.49
p-value                         0.00
#Lags Used                      0.00
Numer of Observations Used    154.00
dtype: float64
----------------------------------------
Running ADF on SFR6-7

Test Statistic                 -7.693
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER6-8

Test Statistic                -14.554
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFR6-8

Test Statistic                 -7.722
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER6-9

Test Statistic                -10.613
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR6-9

Test Statistic                 -7.766
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER6-10

Test Statistic                -10.612
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR6-10

Test Statistic                 -7.928
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER6-11

Test Statistic                -10.525
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR6-11

Test Statistic                 -8.131
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER6-12

Test Statistic                -10.398
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR6-12

Test Statistic                 -8.246
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER7-8

Test Statistic                -10.44
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on SFR7-8

Test Statistic                 -7.941
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER7-9

Test Statistic                -10.776
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR7-9

Test Statistic                 -7.994
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER7-10

Test Statistic                -10.58
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on SFR7-10

Test Statistic                 -8.265
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER7-11

Test Statistic                -10.504
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR7-11

Test Statistic                 -8.527
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER7-12

Test Statistic                -10.335
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR7-12

Test Statistic                -15.165
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER8-9

Test Statistic                -11.058
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR8-9

Test Statistic                 -8.207
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on ER8-10

Test Statistic                -10.709
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR8-10

Test Statistic                -15.382
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER8-11

Test Statistic                -10.59
p-value                         0.00
#Lags Used                      1.00
Numer of Observations Used    153.00
dtype: float64
----------------------------------------
Running ADF on SFR8-11

Test Statistic                -15.389
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER8-12

Test Statistic                -10.395
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR8-12

Test Statistic                -15.506
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER9-10

Test Statistic                -10.041
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR9-10

Test Statistic                -15.419
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER9-11

Test Statistic                -10.327
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR9-11

Test Statistic                -15.539
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ER9-12

Test Statistic                -10.199
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR9-12

Test Statistic                 -4.744
p-value                         0.000
#Lags Used                      5.000
Numer of Observations Used    149.000
dtype: float64
----------------------------------------
Running ADF on ER10-11

Test Statistic                -10.194
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR10-11

Test Statistic                 -4.811
p-value                         0.000
#Lags Used                      7.000
Numer of Observations Used    147.000
dtype: float64
----------------------------------------
Running ADF on ER10-12

Test Statistic                -10.176
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR10-12

Test Statistic                 -4.919
p-value                         0.000
#Lags Used                      7.000
Numer of Observations Used    147.000
dtype: float64
----------------------------------------
Running ADF on ER11-12

Test Statistic                -10.022
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on SFR11-12

Test Statistic                -15.391
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFRER4

Test Statistic                -13.316
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFRER6

Test Statistic                -13.083
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on SFRER8

Test Statistic                 -8.515
p-value                         0.000
#Lags Used                      3.000
Numer of Observations Used    151.000
dtype: float64
----------------------------------------
Running ADF on SFRER10

Test Statistic                 -8.183
p-value                         0.000
#Lags Used                      3.000
Numer of Observations Used    151.000
dtype: float64
----------------------------------------
Running ADF on SFRER12

Test Statistic                -13.338
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on US302

Test Statistic                -15.346
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on US305

Test Statistic                -13.64
p-value                         0.00
#Lags Used                      0.00
Numer of Observations Used    154.00
dtype: float64
----------------------------------------
Running ADF on US3010

Test Statistic                -12.648
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on US102

Test Statistic                 -8.204
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on US105

Test Statistic                -14.196
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on US52

Test Statistic                 -6.032
p-value                         0.000
#Lags Used                      2.000
Numer of Observations Used    152.000
dtype: float64
----------------------------------------
Running ADF on DE302

Test Statistic                -10.914
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on DE305

Test Statistic                -10.856
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on DE3010

Test Statistic                -10.591
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on DE102

Test Statistic                 -4.594
p-value                         0.000
#Lags Used                      5.000
Numer of Observations Used    149.000
dtype: float64
----------------------------------------
Running ADF on DE105

Test Statistic                -12.363
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on DE52

Test Statistic                 -8.98
p-value                         0.00
#Lags Used                      2.00
Numer of Observations Used    152.00
dtype: float64
----------------------------------------
Running ADF on CA102

Test Statistic                -10.887
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on CA105

Test Statistic                -14.772
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on CA52

Test Statistic                -13.536
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on ITDE10

Test Statistic                -13.033
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on USDE2

Test Statistic                -12.542
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on USDE5

Test Statistic                 -4.988
p-value                         0.000
#Lags Used                      8.000
Numer of Observations Used    146.000
dtype: float64
----------------------------------------
Running ADF on USDE10

Test Statistic                 -7.119
p-value                         0.000
#Lags Used                      4.000
Numer of Observations Used    150.000
dtype: float64
----------------------------------------
Running ADF on USEZINF2

Test Statistic                -13.366
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on USEZINF5

Test Statistic                -15.032
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on USEZINF10

Test Statistic                -17.683
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on USINF102

Test Statistic                -11.303
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on USINF105

Test Statistic                 -4.939
p-value                         0.000
#Lags Used                      7.000
Numer of Observations Used    147.000
dtype: float64
----------------------------------------
Running ADF on USINF52

Test Statistic                -10.194
p-value                         0.000
#Lags Used                      0.000
Numer of Observations Used    154.000
dtype: float64
----------------------------------------
Running ADF on EZINF102

Test Statistic                -10.929
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
Running ADF on EZINF105

Test Statistic                 -4.881
p-value                         0.000
#Lags Used                      6.000
Numer of Observations Used    148.000
dtype: float64
----------------------------------------
Running ADF on EZINF52

Test Statistic                -12.958
p-value                         0.000
#Lags Used                      1.000
Numer of Observations Used    153.000
dtype: float64
----------------------------------------
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Model Selection & Backtest¶

Test various classification models (i.e., SVC, Logit, Random Forest) to see which model best predicts an up or down move in a position in a walk-forward analysis.

In [26]:
DV01 = 10000
aum = 1000000
look_back = 3

beg = 0
end = beg+(52*look_back)

# capture positions for each trade to be used in portfolio construction later
position_dict = dict()
In [27]:
def get_features (df, beg, end, target):
  '''
  Function takes a pandas dataframe and calculates a correlation matrix
  Using the absolute value of the correlation coefficient, the matrix is sorted
  and filtered to only show correlations above 0.7 up to maximum of 10 relationships.

  Returns the names of the securities with high correlations.
  '''
  _corr = df.iloc[beg:end].diff().corr(method='pearson')[target].abs()
  _feats = _corr.abs().sort_values(by=target, ascending=False).where(lambda x: x > 0.7).dropna().index.tolist()[1:11]
  return _feats

def lag_feature(df, features, target):
  '''
  Function takes a pandas dataframe and inserts lags of all columns.
  Returns a dataframe with all lags inserted, first differenced,
  and a target series as a categorical variable.
  '''
  df = df[features + target].ffill().copy()
  tmp_dict = dict()
  for col in df.columns:
    for l in range(1,5):
      tmp_col = df[col].shift(l).copy()
      tmp_dict[col+'L'+str(l)] = tmp_col
  df = pd.concat([df,
                 pd.DataFrame.from_dict(tmp_dict)],
                 axis=1
  )
  df[target] = df[target].shift(-1)
  df.dropna(inplace=True)

  X = df.diff()[1:].ffill().drop(target, axis=1)
  y = df[target].diff()[1:].ffill()

  upper = 0.03 # +0.03% or up 3 basis points
  lower = -0.03 # -0.03% or down 3 basis points
  y = pd.DataFrame(np.where(y > upper, 1, np.where(y < lower, -1, 0)))
  return X, y



def filter(X, y, beg, end):
  '''
  Function takes the target and features, as well as
  beginning and ending index values as integers.

  Returns target and features over selected time period.
  '''
  X = X.iloc[beg : end].copy()
  y = pd.DataFrame(y).iloc[beg : end].values
  return X, y

def train_logit(X, y):
  '''
  Function takes a feature dataframe and target series,
  and returns a logistic regression model.
  '''
  log = LogisticRegression(solver='saga', multi_class='multinomial')
  log.fit(X, np.reshape(y,-1))
  return log

def valid_logit(X, y, model):
  '''
  Function takes a feature dataframe, target series, and logistic regression
  model, and returns the model score.
  '''
  y_pred = model.predict(X)
  score = model.score(X, y)
  return score

def train_valid_rfc(train_X, train_y, valid_X, valid_y):
  '''
  Function takes a feature dataframe and target series for both the training
  period and validation period.

  A Random Forest Classifier model is trained with some hyperparameter tuning.
  The best model is chosen with the highest F1 score using the validation data.

  The function returns the best model, score, and number of estimators.
  '''
  best_n = 0
  best_score = 0
  for i in range(100, 300, 50):
    rfc = RandomForestClassifier(random_state=51, n_estimators=i)
    rfc.fit(train_X, np.reshape(train_y,-1))
    y_pred = rfc.predict(valid_X)
    tmpf1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    if tmpf1 > best_score:
      best_score = tmpf1
      best_n = i
      best_model = rfc
  return best_model, best_score, best_n

def train_valid_svc(train_X, train_y, valid_X, valid_y):
  '''
  Function takes a feature dataframe and target series for both the training
  period and validation period.

  A Support Vector Classification model is trained with some hyperparameter
  tuning. The best model is chosen with the highest F1 score using the
  validation data.

  The function returns the best model, score, and regularization parameter.
  '''
  best_n = 0
  best_score = 0
  for i in range(1, 1000, 50):
    svc_model = SVC(random_state=51, C=i, kernel='rbf', gamma='auto')
    svc_model.fit(train_X, np.reshape(train_y,-1))
    y_pred = svc_model.predict(valid_X)
    tmpf1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    if tmpf1 > best_score:
      best_score = tmpf1
      best_n = i
      best_model = svc_model
  return best_model, best_score, best_n

def get_bt_stats(_dict, target, DV01, aum):
  '''
  Show backtest results
  '''
  positions = pd.DataFrame.from_dict(_dict, orient='index')
  positions.columns = ['Position']
  positions = positions.join(full_data_wf[target].diff().loc[positions.index], how='left')
  positions['Position'] = positions['Position'].shift(1).fillna(0)
  positions['cum_position'] = positions['Position']

  for r in range(1, positions.shape[0]):
    if positions['cum_position'].iloc[r-1] == 1 and positions['Position'].iloc[r] == 0:
      positions['cum_position'].iloc[r] = 1
    if positions['cum_position'].iloc[r-1] == -1 and positions['Position'].iloc[r] == 0:
      positions['cum_position'].iloc[r] = -1


  positions['trade'] = positions[['cum_position']] *  positions[target].values
  positions['pnl'] = positions['trade'] * 100 * DV01
  positions['pnl'].iloc[0] = aum
  positions['cum_pnl'] = positions['pnl'].cumsum()
  positions['return'] = positions['cum_pnl'].pct_change()
  positions['neg_return'] = positions['return'].where(positions['return'] < 0, np.nan)
  positions['cum_return'] = np.cumprod(1 + positions['return'].fillna(0).values)

  positions['rolling_max'] = positions['cum_pnl'].cummax()
  positions['weekly_drawdown'] = positions['cum_pnl']/positions['rolling_max'] - 1.0
  positions['max_drawdown'] = positions['weekly_drawdown'].cummin()

  for r in range(2, positions.shape[0]):
    if round(positions['cum_return'].iloc[r-1],2) <= 0.0:
      positions['cum_return'].iloc[r-1] = 0.
      positions['cum_return'].iloc[r] = 0.
    else:
      continue

  # show stats
  _m = positions['return'].mean() * 100 * 52
  _s = positions['return'].std() * 100 * np.sqrt(52)
  _negs = positions['neg_return'].std() * 100 * np.sqrt(52)
  print('-----------------------------------------------------------------')
  print('Backtest Results for '+target[0])
  print('Annualized Mean Return %:', round(_m,2))
  print('Annualized St Deviation %:', round(_s,2))
  print('Sharpe Ratio assuming rf = 0:', round(_m/_s,2))
  print('Max Drawdown %:', round(positions['max_drawdown'].min()*100,2))
  print('Cumulative Return %:', round(positions['cum_return'].iloc[-1]*100,2))
  print('Sortino Ratio assuming rf = 0:', round(_m/_negs, 2))
  print('Calmar Ratio', round(_m/(-positions['max_drawdown'].min()*100),2))
  print('-----------------------------------------------------------------')

  # show cumulative return
  plt.figure(figsize=(8,4))
  plt.plot(positions['cum_return'])
  #plt.plot(positions['cum_pnl']/positions['cum_pnl'][0])
  plt.title('Cumulative Return Multiple on US $1,000,000 for '+target[0])
  plt.xlabel('Date')
  plt.grid(True)
  plt.show()

  return positions

Target 1: Euribor 6th contract - 12th contract (3Y-1.5Y Yield Curve)¶

In [28]:
target_1 = ['ER6-12']
target_1_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_1
)

# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_1_features,
                      target_1
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end
)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52
)

score = valid_logit(valid_X,
                    valid_y,
                    log_model
)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.31
Best F1 score for RFC: 0.33
Best number of estimators: 250
Best F1 score for SVC: 0.26
Best C for SVC: 651
In [29]:
# Uncomment to run code to generate tree splits
# an example of how Random Forest splits the data
#for i in range(1):
#    rf_tree = rfc_model.estimators_[i]
#    rf_data = export_graphviz(rf_tree,
#                               feature_names=valid_X.columns,
#                               filled=True,
#                               max_depth=2,
#                               impurity=False,
#                               proportion=True)
#    rf_graph = graphviz.Source(rf_data)
#    display(rf_graph)

Walk-forward analysis¶

In [30]:
target_1_wf_dict = dict() # dictionary to save scores
target_1_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_1_features,
                      target_1
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_1_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_1
    )

    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_1_features,
                          target_1
    )
    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_1_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1))

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_1_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [31]:
position_dict[target_1[0]] = get_bt_stats(target_1_position, target_1, DV01, aum)
-----------------------------------------------------------------
Backtest Results for ER6-12
Annualized Mean Return %: 15.85
Annualized St Deviation %: 20.64
Sharpe Ratio assuming rf = 0: 0.77
Max Drawdown %: -44.59
Cumulative Return %: 339.0
Sortino Ratio assuming rf = 0: 1.09
Calmar Ratio 0.36
-----------------------------------------------------------------
No description has been provided for this image

Target 2:Canada 10Y-2Y Yield Curve¶

In [32]:
target_2 = ['CA102']
target_2_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_2
)

# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_2_features,
                      target_2
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.4
Best F1 score for RFC: 0.37
Best number of estimators: 250
Best F1 score for SVC: 0.47
Best C for SVC: 651

Walk-forward analysis¶

In [33]:
target_2_wf_dict = dict() # dictionary to save scores
target_2_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_2_features,
                      target_2
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_2_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_2
    )
    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_2_features,
                          target_2
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_2_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1))

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_2_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [34]:
position_dict[target_2[0]] = get_bt_stats(target_2_position, target_2, DV01, aum)
-----------------------------------------------------------------
Backtest Results for CA102
Annualized Mean Return %: 16.37
Annualized St Deviation %: 35.79
Sharpe Ratio assuming rf = 0: 0.46
Max Drawdown %: -72.37
Cumulative Return %: 241.6
Sortino Ratio assuming rf = 0: 0.61
Calmar Ratio 0.23
-----------------------------------------------------------------
No description has been provided for this image

Target 3: US 5Y-2Y Yield Curve¶

In [35]:
target_3 = ['US52']
target_3_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_3
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_3_features,
                      target_3
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.29
Best F1 score for RFC: 0.35
Best number of estimators: 150
Best F1 score for SVC: 0.32
Best C for SVC: 251

Walk-forward analysis¶

In [36]:
target_3_wf_dict = dict() # dictionary to save scores
target_3_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_3_features,
                      target_3
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_3_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_3
    )
    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_3_features,
                          target_3
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_3_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
  )

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)


  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_3_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [37]:
position_dict[target_3[0]] = get_bt_stats(target_3_position, target_3, DV01, aum)
-----------------------------------------------------------------
Backtest Results for US52
Annualized Mean Return %: 11.97
Annualized St Deviation %: 20.92
Sharpe Ratio assuming rf = 0: 0.57
Max Drawdown %: -37.48
Cumulative Return %: 238.96
Sortino Ratio assuming rf = 0: 0.86
Calmar Ratio 0.32
-----------------------------------------------------------------
No description has been provided for this image

Target 4: US 30Y-2Y Yield Curve¶

In [38]:
target_4 = ['US302']
target_4_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_4
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_4_features,
                      target_4
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.38
Best F1 score for RFC: 0.4
Best number of estimators: 250
Best F1 score for SVC: 0.35
Best C for SVC: 451

Walk-forward analysis¶

In [39]:
target_4_wf_dict = dict() # dictionary to save scores
target_4_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_4_features,
                      target_4
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_4_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_4
    )
    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_4_features,
                          target_4
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_4_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
  )

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_4_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [40]:
position_dict[target_4[0]] = get_bt_stats(target_4_position, target_4, DV01, aum)
-----------------------------------------------------------------
Backtest Results for US302
Annualized Mean Return %: 20.07
Annualized St Deviation %: 24.2
Sharpe Ratio assuming rf = 0: 0.83
Max Drawdown %: -38.5
Cumulative Return %: 460.03
Sortino Ratio assuming rf = 0: 1.43
Calmar Ratio 0.52
-----------------------------------------------------------------
No description has been provided for this image

Target 5: US SOFR 4th contract - 8th contract (2Y-1Y Yield Curve)¶

In [41]:
target_5 = ['SFR4-8']
target_5_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_5
)

# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_5_features,
                      target_5
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.42
Best F1 score for RFC: 0.42
Best number of estimators: 100
Best F1 score for SVC: 0.46
Best C for SVC: 151

Walk-forward analysis¶

In [42]:
target_5_wf_dict = dict() # dictionary to save scores
target_5_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_5_features,
                      target_5
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_5_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_5
    )
    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_5_features,
                          target_5
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_5_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1))

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_5_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [43]:
position_dict[target_5[0]] = get_bt_stats(target_5_position, target_5, DV01, aum)
-----------------------------------------------------------------
Backtest Results for SFR4-8
Annualized Mean Return %: 19.98
Annualized St Deviation %: 38.94
Sharpe Ratio assuming rf = 0: 0.51
Max Drawdown %: -57.86
Cumulative Return %: 304.0
Sortino Ratio assuming rf = 0: 0.67
Calmar Ratio 0.35
-----------------------------------------------------------------
No description has been provided for this image

Target 6: US 10Y-2Y Yield Curve¶

In [44]:
target_6 = ['US102']
target_6_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_6
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_6_features,
                      target_6
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.35
Best F1 score for RFC: 0.2
Best number of estimators: 250
Best F1 score for SVC: 0.34
Best C for SVC: 751

Walk-forward analysis¶

In [45]:
target_6_wf_dict = dict() # dictionary to save scores
target_6_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_6_features,
                      target_6
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_6_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_6
    )
    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_6_features,
                          target_6
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_6_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
  )

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_6_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [46]:
position_dict[target_6[0]] = get_bt_stats(target_6_position, target_6, DV01, aum)
-----------------------------------------------------------------
Backtest Results for US102
Annualized Mean Return %: 18.83
Annualized St Deviation %: 16.33
Sharpe Ratio assuming rf = 0: 1.15
Max Drawdown %: -24.88
Cumulative Return %: 474.32
Sortino Ratio assuming rf = 0: 2.09
Calmar Ratio 0.76
-----------------------------------------------------------------
No description has been provided for this image

Target 7: German 10Y-2Y Yield Curve¶

In [47]:
target_7 = ['DE102']
target_7_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_7
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_7_features,
                      target_7
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.37
Best F1 score for RFC: 0.29
Best number of estimators: 100
Best F1 score for SVC: 0.34
Best C for SVC: 301

Walk-forward analysis¶

In [48]:
target_7_wf_dict = dict() # dictionary to save scores
target_7_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_7_features,
                      target_7
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_7_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_7
    )

    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_7_features,
                          target_7
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_7_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
  )

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_7_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [49]:
position_dict[target_7[0]] = get_bt_stats(target_7_position, target_7, DV01, aum)
-----------------------------------------------------------------
Backtest Results for DE102
Annualized Mean Return %: 19.16
Annualized St Deviation %: 16.98
Sharpe Ratio assuming rf = 0: 1.13
Max Drawdown %: -24.96
Cumulative Return %: 483.8
Sortino Ratio assuming rf = 0: 1.76
Calmar Ratio 0.77
-----------------------------------------------------------------
No description has been provided for this image

Target 8: German 5Y-2Y Yield Curve¶

In [50]:
target_8 = ['DE52']
target_8_features = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_8
)

# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
                      target_8_features,
                      target_8
)

# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)

log_model = train_logit(X_, y_)

valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.37
Best F1 score for RFC: 0.26
Best number of estimators: 150
Best F1 score for SVC: 0.35
Best C for SVC: 1

Walk-forward analysis¶

In [51]:
target_8_wf_dict = dict() # dictionary to save scores
target_8_position = dict() # dictionary to save positions

# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
                      target_8_features,
                      target_8
)

for idx in range(0, X.shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_8_features = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_8
    )

    # Get data
    X, y = lag_feature(full_data_wf.loc[:'2023'],
                          target_8_features,
                          target_8
    )

    # Run Logit Model
    _X, _y = filter(X, y, beg, end)

    _log_model = train_logit(_X, _y)

    _valid_X, _valid_y = filter(X, y, _end, _end+52)

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_8_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
  )

  _log_pred = _log_model.predict(_test_X)
  _rfc_pred = _rfc_model.predict(_test_X)
  _svc_pred = _svc_model.predict(_test_X)

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_8_position[_test_X.index[0]] = _mode[0]

Backtest results¶

In [52]:
position_dict[target_8[0]] = get_bt_stats(target_8_position, target_8, DV01, aum)
-----------------------------------------------------------------
Backtest Results for DE52
Annualized Mean Return %: 15.16
Annualized St Deviation %: 30.91
Sharpe Ratio assuming rf = 0: 0.49
Max Drawdown %: -45.62
Cumulative Return %: 253.3
Sortino Ratio assuming rf = 0: 0.72
Calmar Ratio 0.33
-----------------------------------------------------------------
No description has been provided for this image

Target 9: Italy 10Y - German 10Y¶

In [53]:
def make_pca(df, beg, end):
  cds = df.ffill().iloc[beg : end].diff().dropna().values
  scaler = StandardScaler()
  scaled_cds = scaler.fit_transform(cds)

  pca = PCA(n_components=3)

  pc = pca.fit_transform(scaled_cds)

  # multiply PCA loadings with scaled underlying data
  scaled_cds_array = np.array(scaled_cds)
  scaled_loadings = np.dot(scaled_cds_array, pca.components_.T)

  # multiply "scaled loadings" with principal component weights, respectively
  # to create one index that combines all three principal components
  # this index had to be negated to make it comparable with underlying CDS data

  EZ_risk_index = -np.dot(scaled_loadings, pca.explained_variance_)
  EZ_risk = pd.DataFrame(EZ_risk_index.cumsum(),
                         index=df.iloc[beg : end].index[1:]
  )
  EZ_risk.columns = ['EZ Risk Index']
  return EZ_risk/100

def lag_feature_pca(df, features_df, features2, target, beg, end, _beg):
  '''
  Function takes a pandas dataframe and inserts lags of all columns.
  Returns a dataframe with all lags inserted, first differenced,
  and a target series as a categorical variable.
  '''
  df_ = df[features2 + target].iloc[beg : end].ffill().copy()
  pca_df = make_pca(features_df, beg, end)
  df_ = pd.concat([df_, pca_df], axis=1)


  tmp_dict = dict()
  for col in df_.columns:
    for l in range(1,5):
      tmp_col = df_[col].shift(l).copy()
      tmp_dict[col+'L'+str(l)] = tmp_col
  df_ = pd.concat([df_,
                 pd.DataFrame.from_dict(tmp_dict)],
                 axis=1
  )
  df_[target] = df_[target].shift(-1)

  df_.dropna(inplace=True)

  X = df_.diff()[1:].ffill().iloc[_beg:].drop(target, axis=1)
  y = df_[target].diff()[1:].ffill().iloc[_beg:]

  upper = 0.03
  lower = -0.03
  y = pd.DataFrame(np.where(y > upper, 1, np.where(y < lower, -1, 0)))
  return X, y
In [54]:
target_9 = ['ITDE10']
target_9_features = file_dict['ez_cds'].resample('W-FRI').last().copy()

target_9_features2 = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_9
)

# Get data
X_, y_ = lag_feature_pca(full_data_wf.loc[:'2023'],
                      target_9_features,target_9_features2,
                      target_9, beg, end, 0
)

# Run Logit Model

log_model = train_logit(X_, y_)


valid_X, valid_y = lag_feature_pca(full_data_wf.loc[:'2023'],
                      target_9_features,target_9_features2,
                      target_9, beg, end+52, end
)

#valid_X, valid_y = filter(_X, _y, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

#print(_X)
#print(_y)
Logit model score: 0.33
In [55]:
target_9 = ['ITDE10']
target_9_features = file_dict['ez_cds'].resample('W-FRI').last().copy()
#target_9_features2 = feature_dict[target_9[0]]

target_9_features2 = get_features(full_data_wf.loc[:'2023'],
                                     beg, end, target_9
)

# Note that this process is slightly different given the PCA feature

# Get data
X_, y_ = lag_feature_pca(full_data_wf.loc[:'2023'],
                      target_9_features,target_9_features2,
                      target_9, beg, end, 0
)

# Run Logit Model

log_model = train_logit(X_, y_)


valid_X, valid_y  = lag_feature_pca(full_data_wf.loc[:'2023'],
                      target_9_features,target_9_features2,
                      target_9, beg, end+52, -52
)
#valid_X, valid_y = filter(X_, y_, end, end+52)

score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))

# Run Random Forest Classification Model

rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)

# Run SVC Model

svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)

print()
Logit model score: 0.33
Best F1 score for RFC: 0.32
Best number of estimators: 100
Best F1 score for SVC: 0.47
Best C for SVC: 351

Walk-forward analysis¶

In [56]:
target_9_wf_dict = dict() # dictionary to save scores
target_9_position = dict() # dictionary to save positions

# Get data
_X, _y = lag_feature_pca(full_data_wf.loc[:'2023'],
                    target_9_features,target_9_features2,
                    target_9, 0, 0+(52*look_back), 0
)

for idx in range(0, full_data_wf.loc[:'2023'].shape[0] - (52*(look_back+2))):

  _beg = idx
  _end = idx + (52*look_back)

  if idx % 13 == 0: # refresh models every quarter
    # Refresh features via updated correlations
    target_9_features2 = get_features(full_data_wf.loc[:'2023'],
                                     _beg, _end, target_9
    )

    if 'EZ Risk Index' in target_9_features2:
      target_9_features2.remove('EZ Risk Index')

    # Get data
    _X, _y = lag_feature_pca(full_data_wf.loc[:'2023'],
                          target_9_features, target_9_features2,
                          target_9, _beg, _end, 0
    )

    # Run Logit Model

    _log_model = train_logit(_X, _y)


    _valid_X, _valid_y  = lag_feature_pca(full_data_wf.loc[:'2023'],
                      target_9_features,target_9_features2,
                      target_9, _beg, _end+52, -52
    )

    _score = valid_logit(_valid_X, _valid_y, _log_model)

    # Run Random Forest Classification Model
    _rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)

    # Run SVC Model
    _svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)

    # Save the scores indexed by ending year of training period
    target_9_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]

  # Forecast
  _test_X, _test_y  = lag_feature_pca(full_data_wf.loc[:'2023'],
                      target_9_features,target_9_features2,
                      target_9, _beg, _end+52+1, -5
  )

  _log_pred = _log_model.predict(_test_X.iloc[[-1]])
  _rfc_pred = _rfc_model.predict(_test_X.iloc[[-1]])
  _svc_pred = _svc_model.predict(_test_X.iloc[[-1]])

  _mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
  if len(_mode) > 1:
    _mode = [0]

  target_9_position[_test_X.iloc[[-1]].index[0]] = _mode[0]

Backtest results¶

In [57]:
position_dict[target_9[0]] = get_bt_stats(target_9_position, target_9, DV01, aum)
-----------------------------------------------------------------
Backtest Results for ITDE10
Annualized Mean Return %: 679.35
Annualized St Deviation %: 2157.53
Sharpe Ratio assuming rf = 0: 0.31
Max Drawdown %: -347.3
Cumulative Return %: 0.0
Sortino Ratio assuming rf = 0: 0.47
Calmar Ratio 1.96
-----------------------------------------------------------------
No description has been provided for this image

Portfolio Construction¶

Equal Weight¶

In [58]:
strat_returns = pd.DataFrame()
for k in position_dict.keys():
  if k == 'ITDE10': # skip this strategy because backtest results were poor
    continue
  _ = pd.DataFrame.from_dict(position_dict[k][['return']])
  _.columns = [k]
  strat_returns = pd.concat([strat_returns, _], axis=1)

# create an equal-weighted portfolio
eq_w_pf = pd.DataFrame(strat_returns.mean(axis=1), columns=['return']).fillna(0)

# calculate performance statistics
eq_w_pf['neg_return'] = eq_w_pf['return'].where(eq_w_pf['return'] < 0, np.nan)
eq_w_pf['cum_return'] = np.cumprod(1 + eq_w_pf['return'].fillna(0).values)

eq_w_pf['rolling_max'] = eq_w_pf['cum_return'].cummax()
eq_w_pf['weekly_drawdown'] = eq_w_pf['cum_return']/eq_w_pf['rolling_max'] - 1.0
eq_w_pf['max_drawdown'] = eq_w_pf['weekly_drawdown'].cummin()

# repeat 0 for return if portfolio wipes out
for r in range(2, eq_w_pf.shape[0]):
  if round(eq_w_pf['cum_return'].iloc[r-1],2) <= 0.0:
    eq_w_pf['cum_return'].iloc[r-1] = 0.
    eq_w_pf['cum_return'].iloc[r] = 0.
  else:
    continue
eq_w_pf['cum_return'].plot(title='Equal Weighted Portfolio - Cumulative Return')
plt.show()

# show stats
eq_m = eq_w_pf['return'].mean() * 100 * 52
eq_s = eq_w_pf['return'].std() * 100 * np.sqrt(52)
eq_negs = eq_w_pf['neg_return'].std() * 100 * np.sqrt(52)
print('-----------------------------------------------------------------')
print('Backtest Results for Equal Weighted Portfolio')
print('Annualized Mean Return %:', round(eq_m,2))
print('Annualized St Deviation %:', round(eq_s,2))
print('Sharpe Ratio assuming rf = 0:', round(eq_m/eq_s,2))
print('Max Drawdown %:', round(eq_w_pf['max_drawdown'].min()*100,2))
print('Cumulative Return %:', round(eq_w_pf['cum_return'].iloc[-1]*100,2))
print('Sortino Ratio assuming rf = 0:', round(eq_m/eq_negs, 2))
print('Calmar Ratio', round(eq_m/(-eq_w_pf['max_drawdown'].min()*100),2))
print('-----------------------------------------------------------------')
No description has been provided for this image
-----------------------------------------------------------------
Backtest Results for Equal Weighted Portfolio
Annualized Mean Return %: 17.14
Annualized St Deviation %: 14.59
Sharpe Ratio assuming rf = 0: 1.17
Max Drawdown %: -23.6
Cumulative Return %: 418.83
Sortino Ratio assuming rf = 0: 1.72
Calmar Ratio 0.73
-----------------------------------------------------------------

HRP¶

The HRP process is updated on an annual basis, i.e., annual portfolio optimization of weights. The input is the prior year's worth of weekly returns for the eight final spreads assuming a portfolio that was long all of the spreads.

In [64]:
%%capture
# line above is to silence some warning messages from Pandas

hrp_wts = pd.DataFrame()
for year in range(2014,2023):

  strategies = full_data_wf[strat_returns.columns].diff().loc[str(year-1)].fillna(0) * 100 * DV01
  strategies.iloc[0] = aum
  strategies_cum_pnl = strategies.cumsum()
  strategies_rets = strategies_cum_pnl.pct_change()[1:]

  # Hierarchical risk parity:
  # Get initial weights using spread returns from prior year
  #   assuming a portfolio long all of the 8 spreads
  hrp = HRPOpt(strategies_rets)
  weights = hrp.optimize()
  hrp.portfolio_performance(verbose=False) # minimize printouts
  hrp_wts = pd.concat([hrp_wts,
                       pd.DataFrame.from_dict(weights, orient='index').T], axis=0)

  # show how dendrogram changes from beginning to end
  if year in [2014, 2022]:
    print('Year:',year)
    print('Weights:')
    print(pd.DataFrame.from_dict(weights,
                                 orient='index',
                                 columns=['Weights']).sort_values(by='Weights',
                                                          ascending=False)
    )
    plotting.plot_dendrogram(hrp)  # to plot dendrogram
    plt.show()

hrp_wts.set_index([pd.date_range(start=strat_returns.index[0],
                                 end='2023',periods=9)], inplace=True
)
In [65]:
# convert HRP weights from annual to Week at Friday frequency
hrp_wts_wf = hrp_wts.resample('W-FRI').last().ffill()
hrp_wts_wf.head()
Out[65]:
CA102 DE102 DE52 ER6-12 SFR4-8 US102 US302 US52
2014-02-07 0.195866 0.126547 0.161098 0.108201 0.089734 0.084339 0.08607 0.148145
2014-02-14 0.195866 0.126547 0.161098 0.108201 0.089734 0.084339 0.08607 0.148145
2014-02-21 0.195866 0.126547 0.161098 0.108201 0.089734 0.084339 0.08607 0.148145
2014-02-28 0.195866 0.126547 0.161098 0.108201 0.089734 0.084339 0.08607 0.148145
2014-03-07 0.195866 0.126547 0.161098 0.108201 0.089734 0.084339 0.08607 0.148145
In [66]:
# create HRP-optimized portfolio
hrp_pf = hrp_wts_wf[strat_returns.columns].loc[strat_returns.index].multiply(strat_returns).sum(axis=1)
hrp_pf = pd.DataFrame(hrp_pf, columns=['return'])

# calculate performance statistics
hrp_pf['neg_return'] = hrp_pf['return'].where(hrp_pf['return'] < 0, np.nan)
hrp_pf['cum_return'] = np.cumprod(1 + hrp_pf['return'].fillna(0).values)

hrp_pf['rolling_max'] = hrp_pf['cum_return'].cummax()
hrp_pf['weekly_drawdown'] = hrp_pf['cum_return']/eq_w_pf['rolling_max'] - 1.0
hrp_pf['max_drawdown'] = hrp_pf['weekly_drawdown'].cummin()

# repeat 0 for return if portfolio wipes out
for r in range(2, hrp_pf.shape[0]):
  if round(hrp_pf['cum_return'].iloc[r-1],2) <= 0.0:
    hrp_pf['cum_return'].iloc[r-1] = 0.
    hrp_pf['cum_return'].iloc[r] = 0.
  else:
    continue
hrp_pf['cum_return'].plot(title='Hierarchical Risk Parity Portfolio - Cumulative Return')
plt.show()

# show stats
hrp_m = hrp_pf['return'].mean() * 100 * 52
hrp_s = hrp_pf['return'].std() * 100 * np.sqrt(52)
hrp_negs = hrp_pf['neg_return'].std() * 100 * np.sqrt(52)
print('-----------------------------------------------------------------')
print('Backtest Results for Hierarchical Risk Parity Portfolio')
print('Annualized Mean Return %:', round(hrp_m,2))
print('Annualized St Deviation %:', round(hrp_s,2))
print('Sharpe Ratio assuming rf = 0:', round(hrp_m/hrp_s,2))
print('Max Drawdown %:', round(hrp_pf['max_drawdown'].min()*100,2))
print('Cumulative Return %:', round(hrp_pf['cum_return'].iloc[-1]*100,2))
print('Sortino Ratio assuming rf = 0:', round(hrp_m/hrp_negs, 2))
print('Calmor Ratio', round(hrp_m/(-hrp_pf['max_drawdown'].min()*100),2))
print('-----------------------------------------------------------------')
No description has been provided for this image
-----------------------------------------------------------------
Backtest Results for Hierarchical Risk Parity Portfolio
Annualized Mean Return %: 18.67
Annualized St Deviation %: 17.24
Sharpe Ratio assuming rf = 0: 1.08
Max Drawdown %: -33.9
Cumulative Return %: 462.22
Sortino Ratio assuming rf = 0: 1.58
Calmor Ratio 0.55
-----------------------------------------------------------------